/*--------------------------------------------------------------------------------
	DESCRIPTION: Cleaning relevant cross-country datasets

--------------------------------------------------------------------------------*/

cap erase "$data_dir/raw/mpd2023_clean.dta"
cap erase "$data_dir/raw/average_growth_rates_mpd.dta"
cap erase "$data_dir/raw/rare_disasters_mpd_2023.dta"
cap erase "$data_dir/raw/growth_accelerations_mpd_2023.dta"
cap erase "$data_dir/raw/barro_ursua_gdppc.dta"
cap erase "$data_dir/raw/pwt100_clean.dta"
cap erase "$data_dir/raw/pwt100_barro_clean.dta"
cap erase "$data_dir/raw/reinhart_rogoff_crises.dta"
cap erase "$data_dir/raw/nguyen_financial_crises.dta"
cap erase "$data_dir/raw/financial_crises_clean.dta"
cap erase "$data_dir/raw/gfd_inflation1.dta"
cap erase "$data_dir/raw/gfd_inflation2.dta"
cap erase "$data_dir/raw/gfd_inflation_final.dta"
cap erase "$data_dir/raw/banks_cnts.dta"
cap erase "$data_dir/raw/vdem_democracy.dta"
cap erase "$data_dir/raw/average_institutions.dta"
cap erase "$data_dir/raw/democ_autoc_switch_media.dta"
cap erase "$data_dir/raw/natural_shocks.dta"
cap erase "$data_dir/raw/average_gini_coefficients.dta"
cap erase "$data_dir/raw/wid_usa.dta"
cap erase "$data_dir/raw/che_income_inequality.dta"
cap erase "$data_dir/raw/average_intentional_homicides.dta"
cap erase "$data_dir/raw/us_unemployment_annual.dta"
cap erase "$data_dir/raw/swiss_unemployment_annual.dta"
cap erase "$data_dir/raw/shiller_sp500_return.dta"
cap erase "$data_dir/raw/swiss_stock_returns.dta"
cap erase "$data_dir/raw/wb_income_group.dta"

//--------------------------------------------------------
//1) Clean Maddison Project Database (version 2023)
//--------------------------------------------------------

clear
use "$data_dir/raw/maddison2023_web.dta"

keep if year>=1850 & year!=.
rename countrycode ccode

expand 2 if country=="Czechoslovakia" & year>=1920 & year!=.
duplicates tag country year, gen(isdup)
bys country year: gen n = _n if isdup!=0

replace ccode="SVK" if country=="Czechoslovakia" & n==1 & year>=1920 & year<=1984
replace country="Slovakia" if country=="Czechoslovakia" & n==1 & year>=1920 & year<=1984
drop if ccode=="SVK" & gdppc==.

replace ccode="CZE" if country=="Czechoslovakia" & n==2 & year>=1920 & year<=1969
replace country="Czech Republic" if country=="Czechoslovakia" & n==2 & year>=1920 & year<=1969
drop if ccode=="CZE" & gdppc==.

drop if ccode=="CSK"
drop isdup
drop n

sort ccode year
egen panelid = group(ccode)
xtset panelid year

//impute missing gdp series
gen obs = !missing(gdppc)
bys ccode: gen sum_obs = sum(obs)

egen group_id = group(ccode sum_obs)

bys ccode sum_obs: gen min_obs_year = year if _n==1

rangestat (min) next_year=year (min) next_gdppc=gdppc ///
    if obs, interval(year 1 1000) by(ccode)

gen gap = next_year-min_obs_year
	
gen g = (next_gdppc/gdppc)^(1/(gap))-1

gen gdppc_0 = gdppc
	
bys ccode: carryforward g gap min_obs_year gdppc_0 next_year, replace

gen time = gap-(next_year-year)

gen gdppc_imp = gdppc_0*(1+g)^time if time!=0 & year!=2022

drop obs sum_obs group_id min_obs_year next_year next_gdppc gap g gdppc_0 time

replace gdppc = gdppc_imp if gdppc==.
drop gdppc_imp

//generate relevant GDP and growth variables
gen ln_gdppc = ln(gdppc)
xtset panelid year
gen growth_rate = ln_gdppc-l.ln_gdppc

gen growth_shock = 1 if growth_rate<0 & growth_rate!=.
replace growth_shock = 0 if growth_rate>=0 & growth_rate!=.

sort ccode year

order ccode country year gdp pop gdppc ln_gdppc growth_rate growth_shock
drop panelid

save "$data_dir/raw/mpd2023_clean.dta", replace


//--------------------------------------------------------
//2) Generate time-average growth rates
//--------------------------------------------------------

clear
use "$data_dir/raw/mpd2023_clean.dta"

bys ccode: egen avg_10yr_growth = mean(growth_rate) if year>=2009 & year<=2019
collapse avg_10yr_growth, by(ccode)

save "$data_dir/raw/average_growth_rates_mpd.dta", replace


//--------------------------------------------------------
//3) Construct Barro-Ursua rare disasters variable
//--------------------------------------------------------

clear
use "$data_dir/raw/mpd2023_clean.dta"

egen panelid = group(ccode)
xtset panelid year

gen fall = 1 if growth_rate<=0 & growth_rate!=.
gen rise = 1 if growth_rate>0 & growth_rate!=.

gen in_rise = .
replace in_rise = 1 if rise == 1
replace in_rise = 0 if rise != 1 & !missing(rise)

gen tag = .
gen rise_group = .

bysort country (year): gen in_rise_lag = in_rise[_n-1]

gen is_new_rise = .
bysort ccode (year): replace is_new_rise = (in_rise == 1 & (in_rise_lag != 1 | _n == 1))

bysort ccode (year): gen group_id = sum(is_new_rise)

replace rise_group = group_id if in_rise == 1

bys ccode group_id: egen peak_ln_gdppc = max(ln_gdppc) if rise==1
bys ccode group_id: egen trough_ln_gdppc = min(ln_gdppc) if fall==1

local varlist peak_ln_gdppc trough_ln_gdppc
foreach var of local varlist {
	bys ccode group_id: mipolate `var' year, groupwise gen(`var'_)
}

sort ccode year

gen peak_year = year if ln_gdppc==peak_ln_gdppc_ & growth_rate!=0 & rise==1
gen trough_year = year if ln_gdppc==trough_ln_gdppc_ & growth_rate!=0 & fall==1

gen contraction = trough_ln_gdppc_-peak_ln_gdppc_

drop if gdppc==.
drop if group_id==0

collapse contraction peak_year trough_year, by(ccode group_id)
drop if contraction==0
drop if contraction==.

gen disaster = 1 if abs(contraction)>0.1

keep if disaster==1
drop group_id

tostring trough_year, gen(trough_year_string)
tostring peak_year, gen(peak_year_string)

rename peak_year year1
rename trough_year year2

gen trough_year = trough_year_string
drop trough_year_string

gen peak_year = peak_year_string
drop peak_year_string

egen panelid = group(ccode year1)
reshape long year, i(panelid) j(test)

drop test

xtset panelid year
tsfill

bys panelid: carryforward ccode contraction disaster trough_year peak_year, replace

keep ccode year contraction disaster peak_year trough_year
order ccode year contraction disaster peak_year trough_year
sort ccode year

save "$data_dir/raw/rare_disasters_mpd_2023.dta", replace

clear
use "$data_dir/raw/mpd2023_clean.dta"

merge 1:1 ccode year using "$data_dir/raw/rare_disasters_mpd_2023.dta"
drop _merge

replace disaster = 0 if disaster==. & growth_rate!=.

save "$data_dir/raw/mpd2023_clean.dta", replace


//--------------------------------------------------------
//4) Construct Hausman et al. growth accelerations variable
//--------------------------------------------------------

//use code from: https://github.com/KMWacker/growthaccelerations/blob/main/main.do

clear
use "$data_dir/raw/mpd2023_clean.dta"
keep ccode country year gdppc ln_gdppc

drop if year>2014 & year!=.

*Obtaining the least squares growth rate from t to t+7
label var gdppc "real GDP per capita (rgdpna/pop)"
label var ln_gdppc "natural logarithm of real GDP per capita (rgdpna/pop)"
sort ccode year
rangestat(reg) ln_gdppc year, by(ccode) interval(year 0 7)
rename b_year gr0_7
label var gr0_7 "least squares growth rate of real GDP per capita from t to t+7"
replace gr0_7=. if reg_nobs<8
drop reg_nobs reg_r2 reg_adj_r2 b_cons se_year se_cons

*Obtaining the least squares growth rate from t-7 to t
by ccode: gen gr_lag7= gr0_7[_n-7]
label var gr_lag7 "least squares growth rate of real GDP per capita from t-7 to t"

*Obtaining the change in the growth rate at time t
gen change_growth= gr0_7-gr_lag7
label var change_growth "change in the growth rate at time t over horizon 7"

*Obtaining the output level at the end of the acceleration 
by ccode: gen gdppc_Lead7= gdppc[_n+7]
label var gdppc_Lead7 "level of real GDP per capita in year t+7"

*Obtaining the peak output level 
by ccode: generate record = gdppc[1] if _n==1
by ccode: replace record = max(gdppc, record[_n-1]) if missing(record)
label var record "peak level of real GDP per capita up to and including the current year (year t)"

*Obtaining the least squares growth rate from t+7 to t+17 (for sustained episodes)
rangestat(reg) ln_gdppc year, by(ccode) interval(year 7 17)
rename b_year gr7_17
label var gr7_17 "least squares growth rate of real GDP per capita from t+7 to t+17"
replace gr7_17=. if reg_nobs<11
drop reg_nobs reg_r2 reg_adj_r2 b_cons se_year se_cons

*Identifying years for which conditions are met
generate pot_episode=0
replace pot_episode=1 if gr0_7>=0.035 & gr0_7!=. & change_growth>=0.02 & change_growth!=. & gdppc_Lead7>=record & gdppc_Lead7!=.
label var pot_episode "dummy variable which is 1 when the 3 HPR conditions are met and 0 otherwise"

*Splines
gen key=ccode+string(year,"%02.0f")
encode key, gen(key_id)
drop key
rename key_id key
label var key "key for every country+year"

encode country, gen(country_id)

gen fstat=.
label var fstat "fstatistic of a test for equality of two trend slopes of spline regression"

levelsof key if pot_episode==1, local(epi)
foreach i of local epi{
summ year if key == `i', meanonly
local date = r(min)
summ country_id if key == `i', meanonly
local cnt = r(min)
local start = `date' - 7
local end = `date' + 7
mkspline year1 `date' year2 = year
quietly regress ln_gdppc year1-year2 if inrange(year,`start',`end') & country_id == `cnt'
test year1=year2
gen temp3=r(F) 
replace fstat=temp3 if key==`i'
drop temp3 
drop year1 year2
}

*Identifying the year for which the f-statistic is maximized
bysort ccode (year) : gen pot_spell = cond(pot_episode == 0,  0, sum(pot_episode == 1 & pot_episode[_n-1] == 0)) 
egen max = max(fstat) if pot_spell, by(ccode pot_spell) 
gen acceleration=0
replace acceleration=1 if fstat==max
replace acceleration=0 if pot_episode==0
label var acceleration "dummy equal to 1 if a growth acceleration starts in this year"

*Eliminating episodes within 5 years of other episodes 
replace acceleration=0 if acceleration==1 & acceleration[_n-1]==1 | acceleration[_n-2]==1 | acceleration[_n-3]==1 | acceleration[_n-4]==1

*Recovering episodes which are more than 5 years apart from other episodes 
replace acceleration = cond(acceleration==0 & pot_episode==1 & fstat==max & acceleration[_n-1]==0 & acceleration[_n-2]==0 & acceleration[_n-3]==0 & acceleration[_n-4]==0,1,0) if acceleration!=1

*Identifying sustained growth accelerations 
generate sustained=0 if acceleration==1
replace sustained=1 if acceleration==1 & gr7_17>=0.02 & gr7_17!=.
label var sustained "dummy equal to 1 if a sustained growth acceleration starts in this year"

gen accel_start = year if acceleration == 1
gen accel_end = year + 7 if acceleration == 1
replace accel_end = year + 17 if sustained == 1

keep if acceleration==1

sort ccode year
bys ccode: gen episode_id = _n
xtset country_id episode_id

gen prev_episode_end = l.accel_end
gen overlap_lag = 1 if accel_start <prev_episode_end & accel_start!=. & prev_episode_end!=.
gen prev_f_stat = l.fstat
gen drop_sample = 1 if fstat<prev_f_stat & fstat!=. & prev_f_stat!=. & overlap_lag==1

gen forward_episode_start = f.accel_start
gen overlap_lead = 1 if accel_end > forward_episode_start & accel_end!=. & forward_episode_start!=.
gen forward_f_stat = f.fstat
replace drop_sample = 1 if fstat<forward_f_stat & fstat!=. & forward_f_stat!=. & overlap_lead==1

drop if drop_sample==1

gen n_years = accel_end - accel_start + 1
drop if n_years<0 & n_years!=.
expand n_years

bysort ccode episode_id: gen year_new = accel_start + _n - 1

duplicates tag ccode year_new, gen(isdup)
bys ccode isdup: gen isdup_n = _n if isdup!=0 & isdup!=.
drop if isdup_n==2
drop isdup_n

rename accel_start accel_start_year
rename accel_end accel_end_year

keep ccode country year_new acceleration accel_start_year accel_end_year
rename year_new year
order ccode country year
sort ccode year

save "$data_dir/raw/growth_accelerations_mpd_2023.dta", replace

clear
use "$data_dir/raw/mpd2023_clean.dta"

merge 1:1 ccode year using "$data_dir/raw/growth_accelerations_mpd_2023.dta"
drop _merge

replace acceleration = 0 if acceleration==. & growth_rate!=.

save "$data_dir/raw/mpd2023_clean.dta", replace


//--------------------------------------------------------
//5) Clean Barro-Ursua GDP Database
//--------------------------------------------------------

clear
import excel "$data_dir/raw/barro_ursua_macrodataset_1110.xls", sheet("GDP") firstrow
drop in 1
rename GDPpc year
destring year, replace

ds year, not
local countries `r(varlist)'
local i = 1
foreach var of local countries {
    rename `var' country`i'
    local ++i
}

reshape long country, i(year) j(id)
rename country gdppc
destring gdppc, replace

gen country = ""
replace country = "Argentina"      				if id == 1
replace country = "Australia"      				if id == 2
replace country = "Austria"        				if id == 3
replace country = "Belgium"        				if id == 4
replace country = "Brazil"         				if id == 5
replace country = "Canada"         				if id == 6
replace country = "Chile"          				if id == 7
replace country = "China"          				if id == 8
replace country = "Colombia"       				if id == 9
replace country = "Denmark"        				if id == 10
replace country = "Egypt"          				if id == 11
replace country = "Finland"        				if id == 12
replace country = "France"         				if id == 13
replace country = "Germany"        				if id == 14
replace country = "Greece"         				if id == 15
replace country = "Iceland"        				if id == 16
replace country = "India"          				if id == 17
replace country = "Indonesia"      				if id == 18
replace country = "Italy"          				if id == 19
replace country = "Japan"          				if id == 20
replace country = "Korea, Republic of"          if id == 21
replace country = "Mexico"         				if id == 22
replace country = "Malaysia"       				if id == 23
replace country = "Netherlands"    				if id == 24
replace country = "New Zealand"    				if id == 25
replace country = "Norway"         				if id == 26
replace country = "Peru"           				if id == 27
replace country = "Philippines"    				if id == 28
replace country = "Portugal"       				if id == 29
replace country = "Russian Federation"  		if id == 30
replace country = "South Africa"   				if id == 31
replace country = "Singapore"      				if id == 32
replace country = "Spain"          				if id == 33
replace country = "Sri Lanka"      				if id == 34
replace country = "Sweden"         				if id == 35
replace country = "Switzerland"    				if id == 36
replace country = "Taiwan, Province of China"   if id == 37
replace country = "Turkey"         				if id == 38
replace country = "United Kingdom" 				if id == 39
replace country = "Uruguay"        				if id == 40
replace country = "United States"  				if id == 41
replace country = "Venezuela"      				if id == 42

merge m:1 country using "$data_dir/raw/countrycodes.dta", keepusing(ccode)
drop if _merge==2
drop _merge

drop id
order ccode country year gdppc
sort ccode year

egen panelid = group(ccode)
xtset panelid year

//impute missing gdp series
gen obs = !missing(gdppc)
bys ccode: gen sum_obs = sum(obs)

egen group_id = group(country sum_obs)

bys ccode sum_obs: gen min_obs_year = year if _n==1

rangestat (min) next_year=year (min) next_gdppc=gdppc ///
    if obs, interval(year 1 1000) by(ccode)

gen gap = next_year-min_obs_year
	
gen g = (next_gdppc/gdppc)^(1/(gap))-1

gen gdppc_0 = gdppc
	
bys ccode: carryforward g gap min_obs_year gdppc_0 next_year, replace

gen time = gap-(next_year-year)

gen gdppc_imp = gdppc_0*(1+g)^time if time!=0 & year!=2009

drop obs sum_obs group_id min_obs_year next_year next_gdppc gap g gdppc_0 time

replace gdppc = gdppc_imp if gdppc==.
drop gdppc_imp

//generate relevant GDP and growth variables
gen ln_gdppc = ln(gdppc)
xtset panelid year
gen growth_rate = ln_gdppc-l.ln_gdppc
drop panelid
drop if gdppc==.

save "$data_dir/raw/barro_ursua_gdppc.dta", replace


//--------------------------------------------------------
//6) Clean Penn World Tables (PWT)
//--------------------------------------------------------

clear
use "$data_dir/raw/pwt100.dta"

rename countrycode ccode
egen panelid = group(ccode)
xtset panelid year

gen gdppc = rgdpna/pop
rename rgdpna gdp
replace gdp = gdp*1000000

gen ln_gdppc = ln(gdppc)
gen growth_rate = ln_gdppc-l.ln_gdppc

drop panelid
keep ccode year gdppc pop growth_rate

save "$data_dir/raw/pwt100_clean.dta", replace


//--------------------------------------------------------
//7) Combine Barro-Ursua Database and Penn World Tables
//--------------------------------------------------------

clear
use "$data_dir/raw/barro_ursua_gdppc.dta"

keep ccode year gdppc
rename gdppc barro_gdppc

merge 1:1 ccode year using "$data_dir/raw/pwt100_clean.dta", keepusing(gdppc)
drop _merge

gen real_gdppc_2006 = gdppc if year==2006
replace real_gdppc_2006 = real_gdppc_2006/100

sort ccode year
bys ccode: mipolate real_gdppc_2006 year, groupwise gen(factor)
sort ccode year

gen barro_gdppc_to_pwt = barro_gdppc*factor

rename gdppc pwt_gdppc

gen gdppc = barro_gdppc_to_pwt
replace gdppc = pwt_gdppc if gdppc==. & pwt_gdppc!=.

gen ln_gdppc = ln(gdppc)

egen panelid = group(ccode)
xtset panelid year

gen growth_rate = d.ln_gdppc

keep ccode year growth_rate ln_gdppc

save "$data_dir/raw/pwt100_barro_clean.dta", replace


//--------------------------------------------------------
//8) Gather financial crises data
//--------------------------------------------------------

//REINHART AND ROGOFF DATA
clear
import excel "$data_dir/raw/20160923_global_crisis_data.xlsx", sheet("Sheet1") firstrow
drop in 1

replace InflationAnnualpercentagesof="" if InflationAnnualpercentagesof=="n/a"
replace BankingCrisis="" if BankingCrisis=="n/a"
destring InflationAnnualpercentagesof CurrencyCrises InflationCrises BankingCrisis, replace
rename CC3 ccode
rename Year year

replace ccode = "HUN" if ccode=="HUN "

keep ccode year InflationAnnualpercentagesof CurrencyCrises BankingCrisis

rename InflationAnnualpercentagesof inflation
rename BankingCrisis bank_crisis
rename CurrencyCrises currency_crisis

save "$data_dir/raw/reinhart_rogoff_crises.dta", replace

//NGUYEN, CASTRO AND WOOD DATA
clear
import excel "$data_dir/raw/nguyen_financial_crises.xlsx", firstrow

rename BankingCrises bank_crisis
rename CurrencyCrises currency_crisis
rename Country country
rename Year year

drop if year==.

replace country = "Bahamas" if country=="Bahamas, The"
replace country = "Cape Verde" if country=="Cabo Verde"
replace country = "Democratic Republic of the Congo" if country=="Congo, Dem. Rep."
replace country = "Congo" if country=="Congo, Rep."
replace country = "Côte d'Ivoire" if country=="Cote d'Ivoire"
replace country = "Curaçao" if country=="Curacao"
replace country = "Egypt" if country=="Egypt, Arab Rep."
replace country = "Swaziland" if country=="Eswatini"
replace country = "Gambia" if country=="Gambia, The"
replace country = "Hong Kong" if country=="Hong Kong SAR, China"
replace country = "Iran, Islamic Republic of" if country=="Iran, Islamic Rep."
replace country = "Korea, Republic of" if country=="Korea, Rep."
replace country = "Kyrgyzstan" if country=="Kyrgyz Republic"
replace country = "Lao People's Democratic Republic" if country=="Lao PDR"
replace country = "Macao" if country=="Macao SAR, China"
replace country = "Micronesia, Federated States of" if country=="Micronesia, Fed. Sts."
replace country = "Moldova, Republic of" if country=="Moldova"
replace country = "Macedonia, the Former Yugoslav Republic of" if country=="North Macedonia"
replace country = "Slovakia" if country=="Slovak Republic"
replace country = "Saint Kitts and Nevis" if country=="St. Kitts and Nevis"
replace country = "Saint Lucia" if country=="St. Lucia"
replace country = "Saint Vincent and the Grenadines" if country=="St. Vincent and the Grenadines"
replace country = "Taiwan, Province of China" if country=="Taiwan, China"
replace country = "United Republic of Tanzania" if country=="Tanzania"
replace country = "Venezuela" if country=="Venezuela, RB"
replace country = "Viet Nam" if country=="Vietnam"
replace country = "US Virgin Islands" if country=="Virgin Islands (U.S.)"
replace country = "Yemen" if country=="Yemen, Rep."

merge m:1 country using "$data_dir/raw/countrycodes.dta", keepusing(ccode)
keep if _merge==3
drop _merge

keep ccode year bank_crisis currency_crisis

save "$data_dir/raw/nguyen_financial_crises.dta", replace

//COMBINE
clear
use "$data_dir/raw/nguyen_financial_crises.dta"

local varlist bank_crisis currency_crisis
foreach var of local varlist {
	rename `var' `var'_nguyen
}

merge 1:1 ccode year using "$data_dir/raw/reinhart_rogoff_crises.dta", nogen

local varlist bank_crisis currency_crisis
foreach var of local varlist {
	rename `var' `var'_rr
}

local varlist bank_crisis currency_crisis
foreach var of local varlist {
	replace `var'_nguyen = `var'_rr if `var'_nguyen==. & `var'_rr!=.
	rename `var'_nguyen `var'
}

//ad hoc updates for recent years (2020 - 2022)
merge 1:1 ccode year using "$data_dir/raw/mpd2023_clean.dta", keepusing(ccode year growth_rate)
drop if _merge==2 & year<2020 & year!=.
drop _merge
sort ccode year

replace currency_crisis = 1 if ccode == "TUR" & year == 2021 // Lira collapse
replace currency_crisis = 1 if ccode == "LKA" & year == 2022 // Rupee devaluation after float
replace currency_crisis = 1 if ccode == "PAK" & inrange(year, 2021, 2022) // Rupee depreciation >20%
replace currency_crisis = 1 if ccode == "LBN" & inrange(year, 2020, 2022) // Pound collapse
replace currency_crisis = 1 if ccode == "ZWE" & inrange(year, 2020, 2022) // Hyperinflation and ZWL depreciation

local varlist bank_crisis currency_crisis
foreach var of local varlist {
	replace `var' = 0 if missing(`var') & year>=2020 & year<=2022
}

gen no_twin_crisis = bank_crisis + currency_crisis
gen twin_crisis = 1 if no_twin_crisis>=1 & no_twin_crisis!=.
replace twin_crisis = 0 if bank_crisis==0 & currency_crisis==0

gen contraction = growth_rate if growth_rate<0 & growth_rate!=.
replace contraction = 0 if growth_rate>=0 & growth_rate!=.

local varlist bank_crisis currency_crisis twin_crisis
foreach var of local varlist {
	gen `var'_w = `var'*contraction
}

keep ccode year bank_crisis currency_crisis twin_crisis bank_crisis_w currency_crisis_w twin_crisis_w

save "$data_dir/raw/financial_crises_clean.dta", replace


//--------------------------------------------------------
//9) Gather inflation data
//--------------------------------------------------------

forvalues i = 1/2 {
	clear
	import delimited "$data_dir/raw/gfd_inflation`i'.csv", encoding(UTF-8)
	drop if seriestype == "Consumer Price Indices"
	nrow
	keep Date Ticker Open
	destring Open, gen(inflation)
	drop Open
	
	gen year = regexs(0) if regexm(Date, "[0-9][0-9][0-9][0-9]")
	
	gen ccode = Ticker
	replace ccode = subinstr(ccode,"CP", "", .)
	replace ccode = subinstr(ccode,"MAPC", "", .)
	
	destring year, replace
	
	order ccode year inflation
	keep ccode year inflation
	rename inflation inflation_gfd
	
	save "$data_dir/raw/gfd_inflation`i'.dta", replace
}

//COMBINE
clear
use "$data_dir/raw/gfd_inflation1.dta"
append using "$data_dir/raw/gfd_inflation2.dta"

drop if ccode=="TRNC" | ccode=="V"

sort ccode year
merge 1:1 ccode year using "$data_dir/raw/reinhart_rogoff_crises.dta", keepusing(inflation)
sort ccode year

rename inflation inflation_rr

gen inflation = inflation_gfd
replace inflation = inflation_rr if inflation_gfd==. & inflation_rr!=. //use Reinhart and Rogoff values if missing in GFD

cap drop winsz_inflation
gen winsz_inflation = inflation
summ inflation, de
replace winsz_inflation = r(p99) if inflation>r(p99) & inflation!=.
replace winsz_inflation = r(p1) if inflation<r(p1) & inflation!=.

keep if year<=2022 & year!=.
keep ccode year inflation winsz_inflation

save "$data_dir/raw/gfd_inflation_final.dta", replace


//--------------------------------------------------------
//10) Gather World Bank data
//--------------------------------------------------------

//POPULATION
clear
wbopendata, indicator(SP.POP.TOTL) year(1960:2022) nometadata

keep countrycode yr*
rename countrycode ccode
egen panelid = group(ccode)
reshape long yr, i(panelid) j(year)
rename yr population 
drop panelid

save "$data_dir/raw/pop_wb_1960_present.dta", replace

//GINI COEFFICIENTS
clear
wbopendata, indicator(SI.POV.GINI) year(1961:2023) nometadata

keep countrycode yr*
rename countrycode ccode
egen panelid = group(ccode)
reshape long yr, i(panelid) j(year)
rename yr gini 
drop panelid

bys ccode: egen avg_20yr_gini = mean(gini) if year>=2000 & year<=2023
bys ccode: egen avg_10yr_gini = mean(gini) if year>=2009 & year<=2019

collapse avg_20yr_gini avg_10yr_gini, by(ccode)

sum avg_20yr_gini, de
gen equal = 0 if avg_20yr_gini>r(p50) & avg_20yr_gini!=.
replace equal = 1 if avg_20yr_gini<=r(p50) & avg_20yr_gini!=.

save "$data_dir/raw/average_gini_coefficients.dta", replace

//INTENTIONAL HOMICIDES
clear
wbopendata, indicator(VC.IHR.PSRC.P5) year(1961:2022) nometadata

keep countrycode yr*
rename countrycode ccode
egen panelid = group(ccode)
reshape long yr, i(panelid) j(year)
rename yr homicides
drop panelid

bys ccode: egen avg_10yr_homicides = mean(homicides) if year>=2009 & year<=2019

collapse avg_10yr_homicides, by(ccode)

save "$data_dir/raw/average_intentional_homicides.dta", replace


//--------------------------------------------------------
//11) Gather political crises data
//--------------------------------------------------------

clear
import excel "$data_dir/raw/2025 Edition CNTSDATA.xlsx", sheet("2024 Data")
drop in 1

foreach var of varlist * {
    rename `var' `=`var'[1]'
}

drop in 1

rename Wbcode ccode
drop code

replace ccode = "DEU" if country=="German FR"
replace ccode = "RUS" if country=="Russia"
replace ccode = "RUS" if country=="USSR (Russia)"

expand 2 if country=="Czechoslovakia"
duplicates tag country year, gen(isdup)
bys country year: gen n = _n if isdup!=0

replace ccode="SVK" if country=="Czechoslovakia" & n==1
replace ccode="CZE" if country=="Czechoslovakia" & n==2

drop if ccode==""
drop in 1760/1769

keep ccode year polit12 domestic1 domestic2 domestic3 domestic6 domestic7 domestic8

local varlist year polit12 domestic1 domestic2 domestic3 domestic6 domestic7 domestic8
foreach var of local varlist {
	destring `var', replace
}

rename domestic1 no_assassinations
rename domestic2 no_strikes
rename domestic3 no_guerilla
rename domestic6 no_riots
rename domestic7 no_revolutions
rename domestic8 no_anti_govt_demon
rename polit12 no_exec_changes

local varlist assassinations strikes guerilla riots revolutions anti_govt_demon exec_changes
foreach var of local varlist {
	gen `var' = 1 if no_`var'>0 & no_`var'!=.
	replace `var' = 0 if no_`var'==0
}

gen unrest = 1 if riots==1 | revolutions==1 | anti_govt_demon==1 | guerilla==1 | strikes==1 | assassinations==1
replace unrest = 0 if riots==0 & revolutions==0 & anti_govt_demon==0 & guerilla==0 & strikes==0 & assassinations==0

gen no_unrest = no_riots + no_revolutions + no_anti_govt_demon + no_guerilla + no_strikes + no_assassinations

merge 1:1 ccode year using "$data_dir/raw/mpd2023_clean.dta", keepusing(pop)
drop if _merge==2
drop _merge
replace pop = pop*1000
merge 1:1 ccode year using "$data_dir/raw/pop_wb_1960_present.dta", keepusing(population)
drop if _merge==2
drop _merge

sort ccode year
replace pop = population if pop==. & population!=.
drop population

bys ccode: ipolate pop year, gen(pop_ipo) //interpolate population
drop pop
rename pop_ipo pop

gen prop_unrest = no_unrest/pop

keep ccode year no_exec_changes prop_unrest

save "$data_dir/raw/banks_cnts.dta", replace


//--------------------------------------------------------
//12) Gather democracy and institutions data
//--------------------------------------------------------

//CORE COUNTRY-YEAR DEMOCRACY + INSTITUTIONS PANEL (VDEM)
clear
use "$data_dir/raw/V-Dem-CY-Core-v15.dta"

keep if year<=2023 & year!=.

rename country_text_id ccode

gen democracy = 1 if v2x_regime==2|v2x_regime==3
replace democracy = 0 if v2x_regime==0|v2x_regime==1

gen v2x_exec = (v2x_jucon+v2xlg_legcon)/2
replace v2x_exec = v2x_jucon if v2x_exec==. & v2xlg_legcon==.
replace v2x_exec = v2xlg_legcon if v2x_exec==. & v2x_jucon==.

keep ccode year democracy v2x_polyarchy v2xel_frefair v2x_jucon v2xlg_legcon v2x_corr v2mecenefm v2x_freexp v2x_exec v2csreprss

save "$data_dir/raw/vdem_democracy.dta", replace

//10-YEAR AVERAGE INSTITUTIONAL EXPERIENCE
clear
use "$data_dir/raw/vdem_democracy.dta"

bys ccode: egen avg_10yr_exec = mean(v2x_exec) if year>=2009 & year<=2019
bys ccode: egen avg_10yr_media = mean(v2mecenefm) if year>=2009 & year<=2019
bys ccode: egen avg_10yr_repress = mean(v2csreprss) if year>=2009 & year<=2019
bys ccode: egen avg_10yr_freexp = mean(v2x_freexp) if year>=2009 & year<=2019

collapse avg_10yr_exec avg_10yr_media avg_10yr_repress avg_10yr_freexp, by(ccode)

save "$data_dir/raw/average_institutions.dta", replace

//ALWAYS DEMOCRACIES, ALWAYS AUTOCRACIES AND SWITCHERS
clear
use "$data_dir/raw/vdem_democracy.dta"

keep if year>=1990 & year<=2023
gen dummy = 1

collapse (sum) democracy dummy (mean) v2mecenefm, by(ccode)
rename dummy total_obs
rename democracy no_years_democracy

gen democracy = 1 if no_years_democracy == total_obs 
gen autocracy = 1 if no_years_democracy == 0
gen switcher = 1 if democracy==. & autocracy==.

local varlist democracy autocracy switcher
foreach var of local varlist {
	replace `var'=0 if missing(`var')
}

xtile media_freedom = v2mecenefm, n(3)

summ v2mecenefm, de
gen high_media_freedom = 1 if v2mecenefm>=r(p75) & v2mecenefm!=.
gen med_media_freedom = 1 if v2mecenefm>=r(p50) & v2mecenefm<r(p75) & v2mecenefm!=.
gen low_media_freedom = 1 if v2mecenefm<r(p50) & v2mecenefm!=.

keep ccode democracy autocracy switcher high_media_freedom med_media_freedom low_media_freedom

local varlist high_media_freedom med_media_freedom low_media_freedom
foreach var of local varlist {
	replace `var'=0 if missing(`var')
}

save "$data_dir/raw/democ_autoc_switch_media.dta", replace


//--------------------------------------------------------
//13) Gather epidemics data
//--------------------------------------------------------

clear
import excel "$data_dir/raw/public_emdat_custom_request_2023-11-14_5df26ea7-32e9-4167-8c6f-e71b29183629.xlsx", sheet("EM-DAT Data") firstrow

gen duration = EndYear-StartYear

foreach i in 1 2 3 4 5 6 9 {
	local j = `i' + 1
	expand `j' if duration==`i'
	duplicates tag DisNo, gen(isdup)
	bys DisNo: gen n = _n
	
	replace StartYear = StartYear+n-1 if isdup>0 & isdup!=.
	
	sum n
	replace TotalAffected = TotalAffected/r(max) if duration==`i' //assume uniform annual increase

	drop n isdup
}

drop duration

rename StartYear year

rename ISO ccode

bys ccode year: egen total_affect = total(TotalAffected)
bys ccode year: egen total_affect_epidemic = total(TotalAffected) if DisasterType=="Epidemic"

collapse total_affect total_affect_epidemic, by(ccode year)

egen panelid = group(ccode)

xtset panelid year
tsfill

bys panelid: carryforward ccode, replace
xtset panelid year

merge 1:1 ccode year using "$data_dir/raw/mpd2023_clean.dta", keepusing(pop)
drop if _merge==2
drop _merge
replace pop = pop*1000
merge 1:1 ccode year using "$data_dir/raw/pop_wb_1960_present.dta", keepusing(population)
drop if _merge==2
drop _merge

sort ccode year
replace pop = population if pop==. & population!=.
drop population panelid

bys ccode: ipolate pop year, gen(pop_ipo)
drop pop
rename pop_ipo pop

replace total_affect = 0 if missing(total_affect)
replace total_affect_epidemic = 0 if missing(total_affect_epidemic)

gen prop_affect = total_affect/pop
replace prop_affect = 0 if total_affect==0

gen prop_affect_epidemic = total_affect_epidemic/pop
replace prop_affect_epidemic = 0 if total_affect_epidemic==0
replace prop_affect_epidemic = . if prop_affect==.

drop pop total_affect total_affect_epidemic prop_affect

save "$data_dir/raw/natural_shocks.dta", replace


//--------------------------------------------------------
//14) Inequality for USA and Switzerland
//--------------------------------------------------------

//WID FOR USA
clear
wid, indicators(sptinc) perc(p90p100 p99p100) area(US) ages(992) pop(j)
rename country ccode
replace ccode = "USA"
drop age pop variable

gen value_01 = value if percentile=="p99p100"
replace value = . if percentile=="p99p100"
rename value p90p100
rename value_01 p99p100
collapse p90p100 p99p100, by(ccode year)

tsset year
tsfill

carryforward ccode, replace

drop if year<1880

local varlist p90p100 p99p100
foreach var of local varlist {
	ipolate `var' year, gen(ipo_`var')
}

drop p90p100 p99p100

rename ipo_p90p100 pretax_income_top10
rename ipo_p99p100 pretax_income_top1

save "$data_dir/raw/wid_usa.dta", replace

//SWISS INEQUALITY DATA
clear
use "$data_dir/raw/swiss_inequality_database.dta"

append using "$data_dir/raw/schaltegger_income_data.dta"

sort year

save "$data_dir/raw/che_income_inequality.dta", replace


//--------------------------------------------------------
//15) Unemployment data (US and Swiss)
//--------------------------------------------------------

//USA UNEMPLOYMENT DATA
clear
use "$data_dir/raw/us_unemployment_raw_q.dta"

collapse (mean) Civilianunemploymentrate, by(panel_year)

rename Civilianunemploymentrate unemployment_rate
rename panel_year year

replace unemployment_rate = unemployment_rate/100

save "$data_dir/raw/us_unemployment_annual.dta", replace

//SWISS UNEMPLOYMENT DATA
clear
import excel "$data_dir/raw/switzerland_unemployment_rate_GFD.xlsx", firstrow

drop Ticker

gen date_s = date(Date, "MDY")
format date_s %td
gen year = year(date_s)

rename UnemploymentRate unemployment_rate
replace unemployment_rate = unemployment_rate/100

keep unemployment_rate year

save "$data_dir/raw/swiss_unemployment_annual.dta", replace


//--------------------------------------------------------
//16) Stock market return data (US and Swiss)
//--------------------------------------------------------

//USA STOCK PRICE
clear
import excel "$data_dir/raw/ie_data.xls", sheet("Data")

drop in 1/8

keep A J H

rename A date
rename J realtotalreturnprice
rename H realprice

destring date, replace
destring realprice, replace
gen year = round(date)
bys year: gen month = _n

keep if month==12

tsset year
gen sp500return = (realprice-l.realprice)/l.realprice

keep year sp500return

save "$data_dir/raw/shiller_sp500_return.dta"

//SWISS STOCK PRICE INDEX
clear
import excel "$data_dir/raw/swiss_stock_index_GFD.xlsx", firstrow sheet("Price Data")

keep Date Ticker Real_Close
drop Ticker

gen date_s = date(Date, "MDY")
format date_s %td
gen year = year(date_s)

drop Date date_s
sort year

tsset year
gen stock_return = (Real_Close-l.Real_Close)/Real_Close

tsfill
ipolate stock_return year, gen(stock_return_ipo)

drop stock_return
rename stock_return_ipo stock_return

keep stock_return year
drop if year>2022
drop if year==1899

save "$data_dir/raw/swiss_stock_returns.dta", replace


//--------------------------------------------------------
//17) World Bank income classification (as of Nov 2025)
//--------------------------------------------------------

clear
import excel "$data_dir/raw/CLASS.xlsx", sheet("List of economies") firstrow
drop if Region==""

rename Code ccode
rename Incomegroup income_group

keep ccode income_group
drop if income_group==""

gen high_income = 1 if income_group=="High income"
gen low_income = 1 if income_group=="Low income"
gen lowmid_income = 1 if income_group=="Lower middle income"
gen upmid_income = 1 if income_group=="Upper middle income"

gen middle_income = 1 if lowmid_income==1 | upmid_income==1

local varlist high_income low_income lowmid_income upmid_income middle_income
foreach var of local varlist {
	replace `var' = 0 if missing(`var')
}

save "$data_dir/raw/wb_income_group.dta", replace

